* Prepare data 2

********************************************************************************
*  Get Data for Housing Price Regressions

clear all

*read in data.  
clear
use SFBA_matched_88_08.dta

keep applicantrace coapplicantrace applicantsex coapplicantsex applicantincome  sr_buyer sr_seller sa_yr_blt_effect sa_yr_blt sr_full_part_code property_id ///
county sa_lotsize sa_sqft sa_nbr_bath sa_nbr_bedrms sa_nbr_rms sa_nbr_stories sa_nbr_units sr_unique_id sr_date_transfer sr_val_transfer sr_loan_val_1 sr_loan_val_2 sr_loan_val_3 ///
sa_x_coord sa_y_coord tract sr_mult_apn_flag_keyed arms_length_flag_dfs hiqualmatch medqualmatch loqualmatch flagcode

compress


gen double year = int(sr_date_transfer/10000)


keep if year > 1989

gen match_id = _n
sort match_id

*RENAME/GENERATE SOME OF THE OFT-USED VARIABLES
rename sr_val_transfer price
gen id =  property_id 

gen  fsr_date_transfer =.
gen  fsale = 0

sort  property_id sr_date_transfer
qui sum fsale
local i = 1
while `i' < r(N){
if (property_id[`i'] == property_id[`i'+1]){

qui replace fsr_date_transfer = sr_date_transfer[`i'+1] in `i'
qui replace fsale = 1 in `i'
}
local i = `i' + 1
} 

gen fyear = int(fsr_date_transfer/10000)
replace fyear = 0 if fyear ==.

destring sa_sqft sa_nbr_bath sa_nbr_bedrms sa_nbr_rms sa_nbr_stories sa_nbr_units, replace

******************SAMPLE CUTS*****************************************

*** Drop if "Indicates multiple parcel numbers keyed from the document".
drop if sr_mult_apn_flag_keyed=="M"

*** Drop arms length transactions
drop if arms_length_flag_dfs!="Y" 


*** Drop if buyer name equals seller name or if same buyer buys the property more than once
drop if sr_buyer==sr_seller
by id sr_buyer,sort: gen temp=_N
drop if temp>1
drop temp


*** Drop transactions suspected of improvements. Loan more than value of the house
replace sr_loan_val_1=0 if sr_loan_val_1==.
replace sr_loan_val_2=0 if sr_loan_val_2==.
replace sr_loan_val_3=0 if sr_loan_val_3==.
gen loanamount=sr_loan_val_1+sr_loan_val_2+sr_loan_val_3
gen temp=0
replace temp=1 if loanamount-price>5000
by id,sort: egen temp2=sum(temp)
drop if temp2>=1
drop temp temp2

********************************************************************
*DROP PARTIAL SALES
drop if sr_full_part_code == "P"

*DROP LAND SALES
drop if year<sa_yr_blt

*DROP MAJOR IMPROVEMENTS
gen sa_yr_blt_effect_temp = sa_yr_blt_effect
replace sa_yr_blt_effect_temp = sa_yr_blt if sa_yr_blt_effect_temp ==.
drop if year < sa_yr_blt_effect_temp

*DROP ALL OBS IF MULTIPLE SALES ON SAME DAY FOR DIFFERENT PRICES
duplicates tag id sr_date_transfer, gen(propdaytag)
egen stdprice = sd(price), by(id sr_date_transfer)
gen okflag = (propdaytag==0)
drop if stdprice>0 & okflag==0

*DROP DUPLICATE OBSERVATIONS IF MULTIPLE ENTRIES ON SAME DAY
duplicates drop id sr_date_transfer, force

*DROP IF MULTIPLE SALES WITHIN SAME YEAR
duplicates tag id year, gen(propyeartag)
drop if propyeartag>0

*DROP IF MORE THAN 5 SALES TOTAL
duplicates tag id, gen(numsalesminusone)
drop if numsalesminusone>4

**********************************************************************

**********************************************************************
*CONVERT TO 4 DIGIT, AND DROP PROBLEMS
replace tract = int(tract)
drop if tract<101

**********************************************************************
*SAMPLE CUTS

drop if sa_yr_blt ==.
drop if sa_yr_blt < 1850
drop if sa_lotsize ==.
drop if sa_lotsize > 3*43560
drop if sa_sqft == .
drop if sa_sqft == 0
drop if sa_sqft > 10000

drop if sa_nbr_bath==.
drop if sa_nbr_bath < 1
drop if sa_nbr_bath > 10
drop if sa_nbr_bedrms ==.
drop if sa_nbr_bedrms > 10
drop if sa_nbr_rms ==.
drop if sa_nbr_rms > 15
drop if sa_nbr_stories==.
drop if sa_nbr_stories > 3
drop if sa_nbr_units ==.
drop if sa_nbr_units ==0
drop if sa_nbr_units > 1

drop if sa_x_coord == 0
drop if sa_y_coord == 0
drop if sa_x_coord == .
drop if sa_y_coord == .

rename sa_x_coord longitude
rename sa_y_coord latitude

**********************************************************************
*KEEP DATA FROM 1990 TO 2008 ONLY
keep if year>1989

*GET ALL PRICES IN JANUARY 2000 DOLLARS
* change this for la quietly do cpi_la.do
qui do cpi.do
replace price = price/cpi

*BASIC PRICE DROPS
drop if price == 0
drop if price < 0
pctile pct = price, nq(100)
gen a=pct[1]
gen b=pct[99]
drop if price<a
drop if price>b
drop a b


*DROP ON APPRECIATION/DEPRECIATION
gen logprice = ln(price)
xi: reg logprice i.year
sort year county
save datatemp.dta, replace
clear
use datatemp.dta
statsby _b, by(county): areg logprice _Iyear*, absorb(id) cluster(id) 
reshape long _b_Iyear_, i(county) j(year)
rename _b_Iyear_ a
drop _b_cons
sort year county
save caseshiller_county.dta, replace
clear
use datatemp.dta
merge year county using caseshiller_county.dta
tab _merge
drop if _merge==2
replace a=0 if year==1990
replace a = a+1

egen ttt = rank(price), by(year county)
egen sss = max(ttt), by(year county)
gen ptile = ttt/sss

sort id sr_date_transfer
qui sum id
gen app = .
gen y1 = .
gen y2 = .
gen pt1 = .
gen pt2 = .
gen meanapp = .

local i = 2
while `i' < r(N) +1{
if id[`i'] == id[`i'-1]{
qui replace app = (price[`i']-price[`i'-1])/price[`i'-1] in `i'
qui replace y1 = year[`i'-1] in `i'
qui replace y2 = year[`i'] in `i'
qui replace pt1 = ptile[`i'-1] in `i'
qui replace pt2 = ptile[`i'] in `i'
qui replace meanapp = (a[`i']-a[`i'-1])/a[`i'-1] in `i'
}
local i = `i'+1
}



gen problemapp = 0
replace problemapp = 1 if (meanapp>0 & app>0 & abs(app-meanapp)>.5 & app~=.)
replace problemapp = 1 if (meanapp>0 & app<0 & abs(app-meanapp)>.5 & app~=.)
replace problemapp = 1 if (meanapp<0 & app>0 & abs(app-meanapp)>.5 & app~=.)
replace problemapp = 1 if (meanapp<0 & app<0 & abs(app-meanapp)>.5 & app~=.)

gen problemptile = 0
replace problemptile = 1 if (abs(pt2-pt1)>.4 & app~=.)

gen problem = (problemptile==1 | problemapp==1)
egen problemhouse = max(problem), by(id)

keep if problemhouse == 0

*************************************************************************************

* clean buyer names
gen t =1 if substr(sr_buyer, 1, 1) == "&" | substr(sr_buyer, 1, 1) == "'" | substr(sr_buyer, 1, 1) == "+" | substr(sr_buyer, 1, 1) == "," | substr(sr_buyer, 1, 1) == "-" | substr(sr_buyer, 1, 1) == "/" | substr(sr_buyer, 1, 1) == ";"
replace sr_buyer = substr(sr_buyer, 2, .) if t== 1
drop t 
gen t =1 if substr(sr_buyer, 1, 1) == "&" | substr(sr_buyer, 1, 1) == "'" | substr(sr_buyer, 1, 1) == "+" | substr(sr_buyer, 1, 1) == "," | substr(sr_buyer, 1, 1) == "-" | substr(sr_buyer, 1, 1) == "/" | substr(sr_buyer, 1, 1) == ";"
replace sr_buyer = substr(sr_buyer, 2, .) if t== 1
drop t 
gen t =1 if substr(sr_buyer, 1, 1) == "&" | substr(sr_buyer, 1, 1) == "'" | substr(sr_buyer, 1, 1) == "+" | substr(sr_buyer, 1, 1) == "," | substr(sr_buyer, 1, 1) == "-" | substr(sr_buyer, 1, 1) == "/" | substr(sr_buyer, 1, 1) == ";"
replace sr_buyer = substr(sr_buyer, 2, .) if t== 1
drop t 

* clean seller names
gen t =1 if substr(sr_seller, 1, 1) == "&" | substr(sr_seller, 1, 1) == "'" | substr(sr_seller, 1, 1) == "+" | substr(sr_seller, 1, 1) == "," | substr(sr_seller, 1, 1) == "-" | substr(sr_seller, 1, 1) == "/" | substr(sr_seller, 1, 1) == ";"
replace sr_seller = substr(sr_seller, 2, .) if t== 1
drop t 
gen t =1 if substr(sr_seller, 1, 1) == "&" | substr(sr_seller, 1, 1) == "'" | substr(sr_seller, 1, 1) == "+" | substr(sr_seller, 1, 1) == "," | substr(sr_seller, 1, 1) == "-" | substr(sr_seller, 1, 1) == "/" | substr(sr_seller, 1, 1) == ";"
replace sr_seller = substr(sr_seller, 2, .) if t== 1
drop t 
gen t =1 if substr(sr_seller, 1, 1) == "&" | substr(sr_seller, 1, 1) == "'" | substr(sr_seller, 1, 1) == "+" | substr(sr_seller, 1, 1) == "," | substr(sr_seller, 1, 1) == "-" | substr(sr_seller, 1, 1) == "/" | substr(sr_seller, 1, 1) == ";"
replace sr_seller = substr(sr_seller, 2, .) if t== 1
drop t 

* note:  strpos was called index in earlier stata versions.  

*break out buyer names into first name, last name etc
gen temp1 = strpos(sr_buyer, ",")
replace temp1 = . if temp1 == 0
gen buylastname = trim(substr(sr_buyer, 1, temp1-1))
gen temp2 =  strpos(sr_buyer, "&")
replace temp2 = . if temp2 == 0
gen buyname12 = trim(substr(sr_buyer, temp1+1, temp2-1-temp1))
gen temp3a = strpos(buyname12, " ")
replace temp3a =. if temp3a == 0
gen temp3b = strpos(buyname12, ".")
replace temp3b =. if temp3b == 0
egen temp3 = rmin(temp3a temp3b)
replace temp3 = . if temp3 == 0
gen buyfirstname = trim(substr(buyname12, 1, temp3-1))
gen buyinit = substr(buyfirstname,1,1)
gen buyfirst3 = substr(buyfirstname,1,3)

gen buysponame12 = trim(substr(sr_buyer, temp2+1, .))
gen temp4 = strpos(buysponame12, " ")
replace temp4 = . if temp4 == 0
gen buyspofirstname = trim(substr(buysponame12, 1, temp4-1))
gen buyspoinit = substr(buyspofirstname,1,1)

drop temp1 temp2 temp3 temp3a temp3b temp4

*break out seller names into first name, last name etc
gen temp1 = strpos(sr_seller, ",")
replace temp1 = . if temp1 == 0
gen selllastname = trim(substr(sr_seller, 1, temp1-1))
gen temp2 =  strpos(sr_seller, "&")
replace temp2 = . if temp2 == 0
gen sellname12 = trim(substr(sr_seller, temp1+1, temp2-1-temp1))
gen temp3a = strpos(sellname12, " ")
replace temp3a =. if temp3a == 0
gen temp3b = strpos(sellname12, ".")
replace temp3b =. if temp3b == 0
egen temp3 = rmin(temp3a temp3b)
replace temp3 = . if temp3 == 0
gen sellfirstname = trim(substr(sellname12, 1, temp3-1))
gen sellinit = substr(sellfirstname,1,1)
gen sellfirst3 = substr(sellfirstname,1,3)

gen sellsponame12 = trim(substr(sr_seller, temp2+1, .))
gen temp4 = strpos(sellsponame12, " ")
replace temp4 = . if temp4 == 0
gen sellspofirstname = trim(substr(sellsponame12, 1, temp4-1))
gen sellspoinit = substr(sellspofirstname,1,1)

drop temp1 temp2 temp3 temp3a temp3b temp4

* choose variable to base buyer name matches
egen buyer = concat( buylastname buyfirstname buyspofirstname)
replace buyer=lower(buyer)     
replace buyer=subinstr(buyer, ".", "", .)
replace buyer=subinstr(buyer, " ", "", .)

* choose variable to base seller name matches
egen seller = concat( selllastname sellfirstname sellspofirstname)
replace seller=lower(seller)     
replace seller=subinstr(seller, ".", "", .)
replace seller=subinstr(seller, " ", "", .)

*count how many times a buyer(seller) shows up overall and per year
duplicates tag buyer year, gen(tagbuyy)
duplicates tag buyer, gen(tagbuy)
duplicates tag seller year, gen(tagselly)
duplicates tag seller, gen(tagsell)
*duplicates tag property_id, gen(tagprop)

* does buyer's name include a number
gen numb = (substr(buyer,1,1)=="0" | substr(buyer,1,1)=="1" | substr(buyer,1,1)=="2" | substr(buyer,1,1)=="2" | substr(buyer,1,1)=="4" | substr(buyer,1,1)=="5" | substr(buyer,1,1)=="6" | substr(buyer,1,1)=="7" | substr(buyer,1,1)=="8" | substr(buyer,1,1)=="9") 
* does seller's name include a number
gen nums = (substr(seller,1,1)=="0" | substr(seller,1,1)=="1" | substr(seller,1,1)=="2" | substr(seller,1,1)=="2" | substr(seller,1,1)=="4" | substr(seller,1,1)=="5" | substr(seller,1,1)=="6" | substr(seller,1,1)=="7" | substr(seller,1,1)=="8" | substr(seller,1,1)=="9") 

*does the seller's name include "estate"
gen estates = (substr(seller,-6,6)=="estate" | substr(seller,1,6)=="estate")
*does the seller's name include "trust"
gen trusts = (substr(seller,-5,5)=="trust" | substr(seller,1,5)=="trust"| sellfirstname =="TR")
* assume seller is not an individual if no comma in name
gen indivs = (strpos(sr_seller, ",")~=0)

*Do the same for buyers as sellers
gen estateb = (substr(buyer,-6,6)=="estate" | substr(buyer,1,6)=="estate")
gen trustb = (substr(buyer,-5,5)=="trust" | substr(buyer,1,5)=="trust" | buyfirstname =="TR")
gen indivb = (strpos(sr_buyer, ",")~=0)

gen flaggoodbuy = (tagbuyy < 1 & tagbuy < 5 & buyer~="" & numb~=1 & trustb~=1 & estateb ~= 1 & indivb ~= 0)
gen flaggoodsell = (tagselly < 1 & tagsell < 5 & seller~="" & nums~=1 & trusts~=1 & estates ~= 1 & indivs ~= 0)

keep if flaggoodbuy == 1

*save dataprep_inter_b_sf, replace
********************************************************************************
*clear all
*use dataprep_inter_b_sf

keep   applicantrace coapplicantrace applicantsex coapplicantsex applicantincome match_id property_id longitude latitude tract county price year sa_yr_blt sa_lotsize sa_sqft sa_nbr_bath sa_nbr_bedrms sa_nbr_rms sa_nbr_stories sa_nbr_units  fsale fyear

gen house_age = year-sa_yr_blt

sum

duplicates tag property_id, gen(proptag)
gen numsell = proptag + 1

tab proptag

sum if proptag==0
sum if proptag>0
sum if proptag==1

drop if proptag>2

save bm_sf, replace

********************************************************************************
*  Get Data for Hedonic Regressions
clear all
use bm_sf

gen age = year - sa_yr_blt
gen logp = log(price)

drop if county == 75 | county ==.
gen market = county
replace market = 2 if county == 13
replace market = 3 if county == 41
replace market = 4 if county == 81
replace market = 5 if county == 85

sort property_id year
merge property_id year using sf_long_c
tab _merge
keep if _merge == 3
mvdecode _all, mv(-9999)

rename  vcrimerate v_crime

* generate amenity squared
gen hv_crime_sq = .5*v_crime*v_crime

duplicates tag tract, gen(tagtract)
drop if tagtract < 115

gen yeart = year -1989
egen tractid = group(tract)
drop if v_crime ==.
outfile v_crime hv_crime_sq logp market yeart  tractid age sa_lotsize sa_sqft sa_nbr_rms using hed_data_log, comma wide nolabel replace

********************************************************************************
*  Get Data for Rosen Inversion
clear all
use bm_sf  

drop if county == 75 | county ==.
gen market = county
replace market = 2 if county == 13
replace market = 3 if county == 41
replace market = 4 if county == 81
replace market = 5 if county == 85

sort property_id year
merge property_id year using sf_long_c

tab _merge
keep if _merge == 3
mvdecode _all, mv(-9999)

rename  vcrimerate v_crime

gen t = year-1989

drop if   applicantrace < 2
drop if   applicantrace > 5
drop if applicantincome < 25
drop if applicantincome > 500

keep if v_crime ~= .
outfile  market v_crime t applicantrace applicantincome price  using consump_data, comma wide nolabel replace 
